*************************************************************************************************
/*																								
	Purpose: Clean raw data													
	Creates: NLSMW_analysis.dta												
	Note: Sample is women aged 30-44.										
*/																								
*************************************************************************************************

clear
set more off

* All relative paths below are resolved from this survey's folder.
cd "$Mydirectory1/1_DataSources/NLS_MatureWomen/"

* Run every value-label do-file without echoing it, then open the raw extract.
* NOTE(review): per the original comment these do-files also build the
* preliminary .dta files that are merged in later -- confirm.
	foreach stub in default husbandocc siblings_fertility_hhsize motherocc_Rage15 who_Rlivew_age15 parents_edu {
		quietly run ./RawData/`stub'-value-labels.do
	}

	use ./RawData/NLS_MW_raw.dta, clear

*------------------------------------------------------------------------------*
*------------------------------------------------------------------------------*

** Give the raw NLS variables used below readable names (listed by raw code)
	rename R0000200 weight
	rename R0002101 interview_month
	rename R0002200 age
	rename R0002203 yearborn
	rename R0002300 race
	rename R0002400 marital
	rename R0002406 south_residence
	rename R0037500 highestgrade
	rename R0037600 highestgrade_comp
	rename R0044900 veterancomp_R
	rename R0045700 veterancomp_HH
	rename R0049800 placebornR
	rename R0050400 birthplace_father
	rename R0051200 fatherocc
	rename R0070600 empstatusR
	rename R0080000 faminc_raw
	rename R0085500 faminc_bin

** Add other relevant variables
* Each supplementary file is matched one-to-one on the respondent ID R0000100.
* nogenerate keeps the _merge indicator from being created, so nothing has to
* be dropped afterwards; unmatched rows from either side are kept.
	foreach extra in husbandocc parents_edu siblings_fertility_hhsize motherocc_Rage15 who_Rlivew_age15 {
		merge 1:1 R0000100 using "./RawData/`extra'.dta", nogenerate
	}
	
*------------------------------------------------------------------------------*
*------------------------------------------------------------------------------*

**************
*** RESTRICT SAMPLE
**************

* Age -- note: already in the desired range, so no observations are dropped on age
	label variable age "Age of respondent"
	tabulate age, missing

	generate agesq = age*age
	label variable agesq "Age squared"

** Foreignborn
* 1 when placebornR is 7, 0 for any other code, missing when placebornR is -4.
	generate foreignborn = .
	replace foreignborn = (placebornR==7) if placebornR!=-4
	label variable foreignborn "Respondent is foreign born"

* Drop respondents flagged as foreign born; rows with a 0 or missing flag stay.
	drop if foreignborn==1
	
*------------------------------------------------------------------------------*
*------------------------------------------------------------------------------*

**************
*** FAMILY INCOME (1967)
**************
/*
The midpoint of each bin is assigned, with the exception of:
 (1) the last bin, whose bottom value is multiplied by 1.25 (as last bin is always "open-ended"--i.e. "25,000 or more")
 (2) the bottom bin, whose top value is multiplied by 0.75
*/ 
	tab faminc_bin, m

	* Dollar value assigned to each of the 11 bins, in bin order. The first and
	* last entries are the bottom-coded and top-coded values.
	local bottomval = .75*2000
	local topval = 1.25*25000
	local binvalues `bottomval' 2500 3500 4500 5500 6500 7500 9000 12500 20000 `topval'

	* Lower edge of each bin in dollars; the trailing "." leaves the top bin
	* open-ended while still excluding missing values.
	local edges 0 2000 3000 4000 5000 6000 7000 8000 10000 15000 25000 .

	* First pass: use the pre-binned income variable.
	gen fam_inc=.
	forvalues b = 1/11 {
		local val : word `b' of `binvalues'
		replace fam_inc=`val' if faminc_bin==`b'
	}
	label var fam_inc "Family income, midpoints"

	* Second pass: if fam_inc is still missing, place the raw dollar amount
	* into the same bins.
	forvalues b = 1/11 {
		local next = `b' + 1
		local val : word `b' of `binvalues'
		local lo : word `b' of `edges'
		local hi : word `next' of `edges'
		replace fam_inc=`val' if faminc_raw>=`lo' & faminc_raw<`hi' & fam_inc==.
	}

/*Note: The suffix "_son" is used to match the variable names in other datasets.
        All respondents (i.e., female) are given a value for these variables.*/
	gen bottomcoded_son = (fam_inc==`bottomval') if fam_inc<.
	gen topcoded_son = (fam_inc==`topval') if fam_inc<.
	label var bottomcoded_son "Respondent family income, bottom coded"
	label var topcoded_son "Respondent family income, top coded"

	/* Turn fam_inc into 1950 dollars using the CPI: https://data.bls.gov/timeseries/CUUR0000SA0 */
	* The CPI values are stored as constant variables; they are dropped before saving.
	gen CPI1950 = 24.1
	gen CPI1967 = 33.4

	gen fam_inc_real = fam_inc * (CPI1950/CPI1967)
	label var fam_inc_real "Family income, in 1950 dollars"

	gen lnfaminc = ln(fam_inc_real)
	label var lnfaminc "Logged family income"
	
*------------------------------------------------------------------------------*
*------------------------------------------------------------------------------*
	
*********************
*** DEMOGRAPHICS
*********************

*Race
* Race code 3 is set to missing first, so black is 1 when race==2, 0 for any
* other non-missing code, and missing otherwise.
	tab race, m
	replace race=. if race==3
	gen black = (race==2) if race<.
	label var race "Race"

*Marital status
* Each dummy is 0 for every other value of marital, including any missing codes.
	tab marital, m
	gen married = marital==1 | marital==2
	tab married, m
	label var married "Married"
	
	gen never_married = marital==6
	tab never_married, m
	
	gen divorced = marital==4
	tab divorced, m
	
	gen widowed = marital==3
	tab widowed, m
	
	gen separated = marital==5
	tab separated, m
	
* Veteran status 
* NOTE(review): a system-missing (.) value compares as >0 in Stata and would be
* counted as 1 here; this presumably never occurs because missing data are
* negative codes -- confirm with the tabs below.
	tab veterancomp_R, m
	tab veterancomp_HH, m
	gen veteran_fam = veterancomp_R>0 | veterancomp_HH>0
	label var veteran_fam "R or husband receives veteran's comp."
	
* Fatherforeign
* 1 when birthplace_father exceeds 1, 0 when it equals 1 (or lies in (0,1]),
* missing when it is 0 or negative.
	gen fatherforeign = birthplace_father>1 if birthplace_father>0
	
* South-related
* The full variable name placebornR is spelled out so that these lines do not
* rely on variable-name abbreviation (which fails under "set varabbrev off" or
* if another variable starting with "placeborn" is ever added).
	tab placebornR, m nol
	gen bornsouth = 0 if placebornR>0
	replace bornsouth=1 if placebornR==5
	replace bornsouth=1 if south_residence==1 & (placebornR==1 | placebornR==2) 
	label var bornsouth "R born in Southern states"
	
* Moved 
* 0 for placebornR codes 1-2, 1 for codes 3-6, missing otherwise.
	gen moved_region =.
	replace moved_region=0 if placebornR==1 | placebornR==2
	replace moved_region=1 if placebornR>=3 & placebornR<=6
	label var moved_region "Moved region" 
	
* Employment of respondent
* Both comparisons use the full name empstatusR (no abbreviation).
	tab empstatusR, m nol
	gen employed = empstatusR==1 | empstatusR==2 
	label var employed "Respondent is employed"
	
*------------------------------------------------------------------------------*
*------------------------------------------------------------------------------*

*********************
*** EDUCATION
*********************

/* Education for respondent : Adjust years of schooling (based on var "highest grade attended")
   to reflect the grade that R COMPLETED. One year is subtracted when highestgrade_comp==0. */
	gen yrsschool = highestgrade if highestgrade>0 //Note: Highestgrade is never =0.
	replace yrsschool = highestgrade - 1 if highestgrade_comp==0 & highestgrade>0
	label var yrsschool "Years of schooling completed"
	tab yrsschool, m

* Seven education categories (0-6) for the respondent
	gen eduR = 0 if yrsschool==0
	replace eduR = 1 if yrsschool>0 & yrsschool<8 //some grade school
	replace eduR = 2 if yrsschool==8 //completed grade school
	replace eduR = 3 if yrsschool>8 & yrsschool<12 //high school dropout
	replace eduR = 4 if yrsschool==12 //high school degree
	replace eduR = 5 if yrsschool>12 & yrsschool<16 //some college
	replace eduR = 6 if yrsschool>=16 & yrsschool<. //BA (16 yrs) or more
	label var eduR "Education, binned for respondent"

* eduR only takes the values 0-6, so categories 4-6 are HS or more and 6 is college.
	gen hs_ed = inlist(eduR,4,5,6) if !missing(eduR)
	gen coll_ed = (eduR==6) if !missing(eduR)
	label var hs_ed "HS educated"
	label var coll_ed "College educated"

* Representative years of schooling for categories 0-6, in category order.
	local binyears 0 6 8 10 12 14 16

* eduR uses the same cut points as the binned years, so the bins are read off eduR.
	gen yrsschool_bin=.
	forvalues k = 0/6 {
		local slot = `k' + 1
		local yrs : word `slot' of `binyears'
		replace yrsschool_bin = `yrs' if eduR==`k'
	}
	label var yrsschool_bin "Years of school, binned"

* Educational categories for father, then mother (same recode for both)
* NOTE(review): unlike hs_ed/coll_ed above, the parent HS/college dummies are 0
* (not missing) when the parent's education category is missing -- confirm intended.
	local raw_dad father_edu
	local raw_mom mother_edu
	local cap_dad Dad
	local cap_mom Mom

	foreach who in dad mom {
		local raw `raw_`who''
		local cap `cap_`who''

		gen edu_`who' = 0 if `raw'==0 //none
		replace edu_`who' = 1 if inrange(`raw',1,7) //some grade school
		replace edu_`who' = 2 if `raw'==8 //completed 8th grade
		replace edu_`who' = 3 if inrange(`raw',9,11) | `raw'==97 //some HS
		replace edu_`who' = 4 if `raw'==12 //4 years of HS
		replace edu_`who' = 5 if inrange(`raw',13,15) //some college
		replace edu_`who' = 6 if inrange(`raw',16,18) //college
		label var edu_`who' "`cap' education, consistent bins"

		gen `who'_hs_ed = inlist(edu_`who',4,5,6)
		gen `who'_coll_ed = (edu_`who'==6)
		label var `who'_hs_ed "`cap' HS educated"
		label var `who'_coll_ed "`cap' College educated"

		gen edu_`who'_bin = .
		forvalues k = 0/6 {
			local slot = `k' + 1
			local yrs : word `slot' of `binyears'
			replace edu_`who'_bin = `yrs' if edu_`who'==`k'
		}
		tab edu_`who'_bin, m
		label var edu_`who'_bin "`cap' years of schooling using bins"
	}

* Keep the raw parental years of schooling under a consistent name and clean them
	rename father_edu yrsschool_dad
	rename mother_edu yrsschool_mom

	foreach who in dad mom {
		replace yrsschool_`who' = . if yrsschool_`who'<0 /* DK/VALID SKIP */
		replace yrsschool_`who' = 10 if yrsschool_`who'==97 /*some hs, unspecified*/
		tab yrsschool_`who', m
	}

	
*************
* Siblings *
*************

* # of siblings: already coded up as R_num_siblings
* # of brothers--already coded up as R_num_brothers 
* # of sisters--already coded up as R_num_sisters

/* Notes: 
   (1) # of siblings is observed in the 1977 wave, and it is the only year with info on # of siblings.
       There will be some respondents with missing info due to having attrited from the sample between 1967
       and 1977.

   (2) # of brothers and # of sisters are observed in the 1981 wave, and it is the only year that asks
   	   the gender of the respondent's siblings. There will similarly be some respondents with missing 
   	   info due to having attrited from the sample between 1967 and 1981.
*/


*****************
* Own Fertility *     
*****************

* # (living) boys--not available
* Flag indeterminate (high) # of boys--not available
* # (living) girls--not available
* Flag indeterminate (high) # of girls--not available

* # of living kids
/*Note: Includes adopted/step children, biological kids not living in hh, 
        and biological kids living in hh. It does not appear possible to 
        tell if any adopted/step children have died.
*/
	* Sum of the age-specific counts of R's kids living in the household.
	global kids_livinginhh "R_numkids_lessthan6m_livinginhh +  R_numkids_6to35m_livinginhh + R_numkids_3to5yrs_livinginhh + R_numkids_6to13yrs_livinginhh + R_numkids_14to17yrs_livinginhh + R_numkids_18plus_livinginhh"
	di "$kids_livinginhh" //Note: Each of these variables appears to include adopted and step children.

	* Adopted/step kids outside the hh = all adopted/step kids minus those in the hh;
	* computed only when both counts are non-missing, and negative results are blanked.
	gen R_numadopstepkids_notlivinginhh = R_num_adoptedstepkids - R_numadoptedstepkids_livinginhh if R_num_adoptedstepkids!=. & R_numadoptedstepkids_livinginhh!=.
	tab R_numadopstepkids_notlivinginhh, m

	replace R_numadopstepkids_notlivinginhh = . if R_numadopstepkids_notlivinginhh<0 //replace nonsensical negative values
	label var R_numadopstepkids_notlivinginhh "# of step/adopted children not living in R's hh"

	* Start from the kids living in the hh, then add each "not living in hh"
	* count only when it is observed (a missing count contributes nothing).
	gen R_numkids_living = $kids_livinginhh
	replace R_numkids_living = R_numkids_living + R_numbiokids_notlivinginhh if R_numbiokids_notlivinginhh!=.
	replace R_numkids_living = R_numkids_living + R_numadopstepkids_notlivinginhh if R_numadopstepkids_notlivinginhh!=.
	tab R_numkids_living, m
	label var R_numkids_living "# of R's kids who are living"

* # of deceased kids--not available
* # of kids ever--not available
* Dummy: has R ever had kids--already coded up

* Dummy: does R have kids right now?
	gen R_kids_now = (R_numkids_living!=0) if !missing(R_numkids_living)
	tab R_kids_now, m

*********************************
* # of persons in R's household *
*********************************

* # of kids (of any age) living in R's hh
	gen R_totnumkids_livinginhh = $kids_livinginhh
	tab R_totnumkids_livinginhh, m
	label var R_totnumkids_livinginhh "Total # of kids (of any age) living in R's hh"

* # of children 0-17 living in R's hh (all kids in the hh minus those aged 18+)
	gen R_totnumkids_0to17_livinginhh = R_totnumkids_livinginhh - R_numkids_18plus_livinginhh
	tab R_totnumkids_0to17_livinginhh, m
	label var R_totnumkids_0to17_livinginhh "Total # of kids (0-17) living in R's hh"

* Total # of people living in R's household
/*Note: Only respondents with all 3 pieces of info about household size
		(i.e., # of kids in hh, # of hh members 18-64, # of hh members 65+)
		will receive a nonmissing value for hh size variables.
*/
	* (marital==1) adds one person for a present spouse; it is 0 for unmarried Rs
	* and for married Rs whose spouse is absent.
	gen R_hhsize_minusR = R_totnumkids_livinginhh + (marital==1) + R_numrel_18to64notspk_livinginhh + R_numrel_65plus_notsp_livinginhh if R_numrel_18to64notspk_livinginhh!=. & R_numrel_65plus_notsp_livinginhh!=.

	tab R_hhsize_minusR, m

	gen R_hhsize_plusR = R_hhsize_minusR + 1 //Note: Adding 1 because "self" is never an option in the relationship to respondent variables that are used to construct hh size.
	tab R_hhsize_plusR, m

	label var R_hhsize_plusR "total # of persons in R's hh (including R)"
	label var R_hhsize_minusR "total # of persons in R's hh (NOT including R)"

*--------------------------------------------------------------------------------------*
*--------------------------------------------------------------------------------------*

*********************
*** OCCUPATION: HUSBAND
*********************

	* The crosswalk is keyed on census1960, so the husband's code temporarily takes that name.
	rename husband_occ census1960
	replace census1960=. if census1960<0

	* keep(master match) discards crosswalk rows that match no respondent.
	merge m:1 census1960 using ../Crosswalks/Crosswalk_1960Census_toANES.dta, keep(master match)

	* Every respondent without a crosswalk match must have a missing occupation code.
	assert census1960==. if _merge==1
	tab census1960 if _merge==1
	drop _merge

	* The crosswalk names its coarse code fatheroccej; here it describes the spouse.
	rename fatheroccej occSPej

	tab occSPej married, m
	tab census1960 if married==1 & occSPej==., m

	rename census1960 husband_occ
	
*--------------------------------------------------------------------------------------*	
*--------------------------------------------------------------------------------------*

*********************
**** FATHER'S OCCUPATION
*********************
/* NOTE: Unlike in some other surveys, father occupation and mother occupation 
         when growing up are provided as separate variables.
*/
	* NOTE(review): census1960 is not renamed back to fatherocc later in this file.
	rename fatherocc census1960

	* keep(master match) discards crosswalk rows that match no respondent.
	merge m:1 census1960 using ../Crosswalks/Crosswalk_1960Census_toANES.dta, keep(master match)

	* Unmatched respondents must all carry the -4 code.
	assert census1960==-4 if _merge==1
	tab census1960 if _merge==1, m
	drop _merge

/* Note: As no variable related to father work organization
         (i.e., private, public, self-employed), is available,
         it's not possible to tell if father was self-employed.
         Cannot do the usual fix to account for self-employment.
*/
	tab fatheroccej, m
	label var fatheroccej "Father's occupation, coarsened"

* Variable for father being either farm laborer or operator
* (codes 71 and 81 of fatheroccej); missing when fatheroccej is missing.
	gen fatherfarm = inlist(fatheroccej,71,81) if fatheroccej!=.
	label var fatherfarm "Father in farm occ."
	
*********************
**** MOTHER'S OCCUPATION
*********************
	sort motherocc

* Build a copy of the crosswalk keyed on motherocc. The data in memory already
* hold census1960 and fatheroccej from the father merge, so the crosswalk
* variables are renamed in a temporary file rather than merged in directly.
	tempfile momxwalk

preserve
	use ../Crosswalks/Crosswalk_1960Census_toANES.dta, clear

	rename census1960 motherocc
	rename fatheroccej motheroccej
	sort motherocc

	save `momxwalk'
restore

	* keep(master match) discards crosswalk rows that match no respondent.
	merge m:1 motherocc using `momxwalk', keep(master match)

	* Unmatched respondents must all carry the -4 code.
	assert motherocc==-4 if _merge==1
	drop _merge

** Dummies for head of household when R was growing up 

/*Following the coding in other surveys: 
--headofhh_father =1 if R lived with both parents when growing up or if R reports having lived with only a father. 
--headofhh_mother =1 if R lived with a mother but not a father. 
--headofhh_othermale =1 if R lived with a male relative and not with R's parents.
--headofhh_otherfemale =1 if R lived with a female relative and not with R's parents or a male relative. 
*/

/*Note: In keeping with other surveys (e.g., NLSOM), respondents who answered 
        "on their own" or "other arrangement" will be assigned to "0" in the 
        dummies. */	

	* Each dummy is missing when R_wholivew_age15 is -4 and 0/1 otherwise.
	gen headofhh_father = inrange(R_wholivew_age15,1,4) if R_wholivew_age15!=-4
	tab headofhh_father, m

	gen headofhh_mother = (R_wholivew_age15==5) if R_wholivew_age15!=-4
	tab headofhh_mother, m

	gen headofhh_othermale = (R_wholivew_age15==6) if R_wholivew_age15!=-4
	tab headofhh_othermale, m

	gen headofhh_otherfemale = (R_wholivew_age15==7) if R_wholivew_age15!=-4
	tab headofhh_otherfemale, m

	//Alternate dummy for father as head of hh during R's childhood.
	/*Note: When the father's occupation (fatheroccej) is observed but R does
			not report who R lived with when growing up (code -4), assume
			that R lived with father.*/
	gen headofhh_father_imputed = headofhh_father
	replace headofhh_father_imputed = 1 if R_wholivew_age15==-4 & fatheroccej!=.
	tab headofhh_father_imputed, m
	label var headofhh_father_imputed "Impute dad when parent occ!=missing & no info about hh head when R was kid"

*****************************************************************************************************
* DUMMIES FOR WHEN WE KNOW WHY DAD OR MOM DIDN'T WORK (I.E. WHY FATHEROCCEJ/MOTHEROCCEJ IS MISSING) *
*****************************************************************************************************
	rename R0051300 dadwork_Rage15

	* 0 whenever the father's coarse occupation is observed; 1 when it is missing,
	* dadwork_Rage15 is 0 and R_wholivew_age15 is one of 1,2,3,4,6; missing otherwise.
	* NOTE(review): dadwork_Rage15==0 presumably means the father did not work -- confirm.
	gen father_notworking = 0 if fatheroccej!=.
	replace father_notworking = 1 if fatheroccej==. & dadwork_Rage15==0 & inlist(R_wholivew_age15,1,2,3,4,6)
	tab father_notworking, m

	* Same construction for the mother, except that the living-arrangement codes
	* 4, 6, 8 and 9 are the ones excluded.
	gen mother_notworking = 0 if motheroccej!=.
	replace mother_notworking = 1 if motheroccej==. & momwork_Rage15==0 & !inlist(R_wholivew_age15,4,6,8,9)
	tab mother_notworking, m
		
*--------------------------------------------------------------------------------------*
*--------------------------------------------------------------------------------------*

*********************
*** BIRTH COHORTS 
*********************
	
* Assign everyone the decade in which they were born
	tab yearborn, m

	* Two-digit years 22-37 become 1922-1937.
	replace yearborn = yearborn + 1900 if inrange(yearborn,22,37)

	* Any value still below 1922 is rebuilt from age in 1967: one year earlier
	* when the interview month is before July, the plain difference otherwise.
	* NOTE(review): a negative (missing-code) interview_month also satisfies <7 -- confirm.
	replace yearborn = 1967-age-1 if yearborn<1922 & interview_month<7
	replace yearborn = 1967-age if yearborn<1922 & interview_month>6
	rename yearborn dob
	label var dob "Year of birth"

	* Decade of birth is defined only for 1920-1939; other years stay missing.
	gen decade = 10*floor(dob/10) if inrange(dob,1920,1939)
	label var decade "Decade of birth"

* Generate dummies for each decade
	tab decade, gen(decade_)
	
*--------------------------------------------------------------------------------------*
*--------------------------------------------------------------------------------------*

*******************
**** INTERACTIONS
*******************
	rename veteran_fam veteran
	global institution_list "veteran black hs_ed coll_ed"

* Demean the variables that we will use
* For each listed variable, create <name>_dm = value minus the sample mean and
* carry the source variable's label over with ", demeaned" appended.
	foreach v of global institution_list {
		summarize `v'
		local mu `r(mean)'
		generate `v'_dm = `v' - `mu'
		local lbl : variable label `v'
		label variable `v'_dm "`lbl', demeaned"
	}

	* black has no variable label of its own, so its demeaned label is set by hand.
	label variable black_dm "black, demeaned"

*********************
*** SAVE
*********************

	* Every respondent in this survey is a woman.
	gen female=1

	* Unique identifier is R0000100
	ren R0000100 id_nlsmw 
	label var id_nlsmw "R ID (unique identifier)"

	* Remove the remaining raw NLS variables and the constant CPI helper variables.
	drop R0* CPI*
	
	* Report duplicates for the log, then stop with an error if id_nlsmw does
	* not uniquely identify observations (duplicates report alone never fails).
	duplicates report id_nlsmw
	isid id_nlsmw

	* Finalize and save (no further sample restriction happens here)
	compress
	sort id_nlsmw
	ren weight weight_nlsmw
	order id_nlsmw weight_nlsmw
	save ./output/NLSMW_analysis.dta, replace
